In [269]:
%matplotlib inline
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from sklearn.base import TransformerMixin
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
In [270]:
# Load the project dataset. The context manager closes the file handle, which the
# bare open() passed straight to pickle.load() never did.
# NOTE: pickle can execute arbitrary code while loading; only open dataset files
# from a trusted source.
# Mode "r" is kept unchanged from the original (this notebook runs on a Python 2
# kernel); Python 3 would require "rb".
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
Since the dataset for this project is so small, a hold-out set will not be used, and only k-fold testing and training splits will be used to measure accuracy.
This is because, with only 146 data points, lots of missing data, and only 18 POIs, even a stratified hold-out set of 20% would contain only 3 or so POIs to do a final test on. Such a small hold-out set does not give much confidence in the precision of the performance metrics, while holding those points out also negatively impacts the ability to create the model.
"when the number of samples is not large, a strong case can be made that a test set should be avoided because every sample may be needed for model building. (...) Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgements. "
[1] Kuhn M., Johnson K. (2013). Applied Predictive Modeling. Springer. p. 67
Hawkins et al. (2003) concisely summarize this point:“holdout samples of tolerable size [. . . ] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate.”
[2] Hawkins D, Basak S, Mills D (2003). “Assessing Model Fit by Cross– Validation.” Journal of Chemical Information and Computer Sciences, 43(2), 579–586
This will be addressed with K-fold cross-validation resampling techniques.
In [271]:
# Overwrite the dataset entries for two people with hand-entered records.
# NOTE(review): presumably these correct values that are wrong in the pickled
# dataset — confirm each figure against the source financial spreadsheet.
# Fields without a value keep the dataset's 'NaN' string placeholder.
data_dict['BELFER ROBERT'] = {'bonus': 'NaN',
'deferral_payments': 'NaN',
'deferred_income': -102500,
'director_fees': 102500,
'email_address': 'NaN',
'exercised_stock_options': 'NaN',
'expenses': 3285,
'from_messages': 'NaN',
'from_poi_to_this_person': 'NaN',
'from_this_person_to_poi': 'NaN',
'loan_advances': 'NaN',
'long_term_incentive': 'NaN',
'other': 'NaN',
'poi': False,
'restricted_stock': -44093,
'restricted_stock_deferred': 44093,
'salary': 'NaN',
'shared_receipt_with_poi': 'NaN',
'to_messages': 'NaN',
'total_payments': 3285,
'total_stock_value': 'NaN'}
data_dict['BHATNAGAR SANJAY'] = {'bonus': 'NaN',
'deferral_payments': 'NaN',
'deferred_income': 'NaN',
'director_fees': 'NaN',
'email_address': 'sanjay.bhatnagar@enron.com',
'exercised_stock_options': 15456290,
'expenses': 137864,
'from_messages': 29,
'from_poi_to_this_person': 0,
'from_this_person_to_poi': 1,
'loan_advances': 'NaN',
'long_term_incentive': 'NaN',
'other': 'NaN',
'poi': False,
'restricted_stock': 2604490,
'restricted_stock_deferred': -2604490,
'salary': 'NaN',
'shared_receipt_with_poi': 463,
'to_messages': 523,
'total_payments': 137864,
'total_stock_value': 15456290}
In [484]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
df = df.drop('TOTAL', axis=0)
Out[484]:
'NaN' was imported as a string instead of a missing value. We will convert these to NaN type and look how many missing values our data has.
In [485]:
# Replace 'NaN' strings with 0's
df = df.replace('NaN', 0)
# Replace email strings with True/False boolean as to whether an email was present or not
# df['email_address'] = df['email_address'].fillna(0).apply(lambda x: x != 0, 1)
# Remove 'email_address' string as a feature
del df['email_address']
Out[485]:
In [464]:
In [427]:
df_original = df.copy()
In [884]:
# Convert features to floats since MinMaxScaler does not like int64's
X_original = df.drop(['poi'], axis=1).astype(float)
y_original = df['poi']
# Drop any row that has only zeros in it, drop from labels first, then from features
y_original = y_original[X_original.abs().sum(axis=1) != 0]
X_original = X_original[X_original.abs().sum(axis=1) != 0]
# Save the names of the features
X_names = X_original.columns
#X_original = X_original.apply(lambda x: x.fillna(0), axis=0)
# Scale the features
standardized = MinMaxScaler().fit_transform(X_original)
# Score the features using a classification scoring function using
# the Anova F-value for the provided sample
selection = SelectKBest(k='all', score_func=f_classif).fit(standardized, y_original)
#new_X = selection.transform(standardized)
#KBestNames = X_names[selection.get_support()]
# Create a pd.DataFrame of the names and scores
scores = pd.DataFrame([X_names, selection.scores_])
scores = scores.T
scores.columns = ['Features', 'Scores']
scores = scores.sort(['Scores'], ascending=False).reset_index(drop=True)
scores
Out[884]:
In [885]:
topKBest = list(scores.Features[0:17])
topKBest
Out[885]:
In [867]:
# Rank the features by importance using an ExtraTrees ensemble fit on the scaled
# features.
ET_selection = ExtraTreesClassifier(n_estimators=1000).fit(standardized, y_original)
# NOTE(review): this transforms with `selection` (the SelectKBest fitted in the
# feature-scoring cell), not with ET_selection — confirm which selector was intended.
ET_new_X = selection.transform(standardized)
# One row per feature, indexed by feature name, ordered most important first.
scores = pd.DataFrame(ET_selection.feature_importances_, index=X_names)
scores.columns = ['Importance']
scores = scores.sort(['Importance'], ascending=False)
# Slice [0:10] so the "TOP10" label really lists ten features; the previous
# [0:9] slice printed only nine.
print("TOP10: \n{0}".format(list(scores.index[0:10])))
print(scores)
# Ascending order puts the most important feature at the top of the horizontal bars.
scores.sort(['Importance'], ascending=True).plot(kind='barh')
Out[867]:
In [868]:
In [ ]:
In [503]:
for i in range(10):
sys.stdout.write('{0}..'.format(i))
sys.stdout.flush()
time.sleep(.1)
In [6]:
# A quick look at the original financial spreadsheet shows TOTAL at the bottom,
# totaling all entries for everyone. This is obviously an outlier with no
# meaningful information and can be removed.
# Filter on the index instead of calling df.drop('TOTAL'): drop() raises when the
# label is no longer present, which happens if this cell is re-run or if the row
# was already removed when the frame was built. The filter is idempotent.
df = df[df.index != 'TOTAL']
In [ ]:
By default, the GridSearchCV uses a 3-fold cross-validation. However, if it detects that a classifier is passed, rather than a regressor, it uses a stratified 3-fold.
http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
In [7]:
# low_var_remover = VarianceThreshold(threshold=.5)
In [8]:
# ************************
# Encode as 0 instead.
# Remove columns with more than 50% NA's
# df_50 = df.dropna(axis=1, thresh=len(df)/2)
# ************************
# Since email_address and poi are True/False, every record should have at least 2 non-NA.
# We'll next remove any rows that don't have at least 2 non-NA values besides these.
# The criteria is: No more than 11 NA's per row.
# df_50 = df_50.dropna(axis=0, thresh=5)
# 128 records remain.
# df_50.info()
When looking at the source of the data, the NA entries in the financial data seem to be values that are reported as zero, since all payment/stock values add up to the total payments/stock values. These NA values should therefore be set to 0 so that they add up to the totals reported by the accounting spreadsheet.
The missing values for NA's for email statistics may be a little more subjective.
Some email statistics are features created with prior knowledge of the entire dataset (i.e. emails to/from poi's). This may be data snooping, since if new data/pois were somehow introduced, it would not be possible to generate these features without prior knowledge of which new data were the poi's.
NA's here imply that the person did not have an email account with Enron, or were not involved in emailing by some other way.
This means all email data features are NA if even one column had missing email data for that person. It is hard to judge what distribution their email statistics would have had if they had been given an email account, since the email data has no ties to the financial data from which a distribution could be inferred.
We have no real way to infer whether a person sent/received 10 emails or 10,000 from completely unrelated financial data from a different dataset with many different people.
For this reason, these NA will also be encoded as 0.
In [9]:
df = df.apply(lambda x: x.fillna(0), axis=0)
In [ ]:
In [570]:
import seaborn as sns
sns.set(style='darkgrid')
f, ax = plt.subplots(figsize=(14, 14))
cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.corrplot(df.corr(), annot=True, sig_stars=False,
diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [587]:
corrs = df.corr()
corrs.sort(['poi'], ascending=False)['poi']
Out[587]:
In [11]:
# Pick a column which we are predicting.
# Find other variables correlated to used KMeansNeighborsRegression to predict/impute
# the missing values.
# df_50.corr().ix[: ,'salary']
In [12]:
# cols1 = ['salary', 'other', 'total_stock_value', 'exercised_stock_options',
# 'total_payments', 'restricted_stock']
# Bonus and salary values don't seem to be missing at random. Anytime there is a null value
# for salary, there is also one for bonus. So bonus can't be used to predict salary on
# the first pass. Predicted salary values will be used to predict bonus values though
# on a second pass.
# cols2= ['salary', 'other', 'total_stock_value', 'exercised_stock_options',
# 'total_payments', 'restricted_stock', 'bonus']
# cols3 = ['to_messages', 'from_this_person_to_poi', 'from_messages',
# 'shared_receipt_with_poi', 'from_poi_to_this_person']
In [13]:
def kcluster_null(df=None, cols=None, process_all=True):
    '''
    Impute missing values in selected columns with a 1-nearest-neighbour regressor.

    Each target column is predicted from the other columns in `cols`: the regressor
    is fit on the rows where the target is present and then used to fill the rows
    where it is missing. Columns are processed sequentially, so values imputed for
    an earlier column are available as predictors for later ones.

    Input:
        df: pandas dataframe with values to impute.
        cols: list of column names to impute and to use for imputing. Ideally the
            columns should be somewhat correlated since they are used to predict
            each other, one column at a time.
        process_all: if True (default) every column in `cols` is imputed. If False
            only cols[0] is imputed, using cols[1:] as predictors. (The original
            code computed this split but never used it.)
    Returns: Pandas dataframe, sorted by index, with null values imputed for the
        processed columns. The dataframe passed in is not modified.
    '''
    income_imputer = KNeighborsRegressor(n_neighbors=1)
    targets = list(cols) if process_all else list(cols[:1])

    for each in targets:
        # Predictors are every listed column except the one being imputed.
        predictor_cols = [col for col in cols if col != each]
        is_missing = df[each].isnull()

        # Skip when there is nothing to impute, or nothing to learn from: fitting
        # or predicting on an empty frame would raise inside the estimator.
        if not is_missing.any() or is_missing.all():
            continue

        # Copies, so the assignment below never writes into a view of the input.
        df_col = df[~is_missing].copy()
        df_null_col = df[is_missing].copy()

        # Predictor columns contain gaps too; fill them with the medians of the
        # TRAINING rows so both frames are filled consistently (previously the
        # rows being predicted were filled with their own medians).
        train_medians = df_col[predictor_cols].median()
        income_imputer.fit(df_col[predictor_cols].fillna(train_medians), df_col[each])
        df_null_col[each] = income_imputer.predict(
            df_null_col[predictor_cols].fillna(train_medians))

        # Recombine; row order is restored by the final sort_index.
        df = pd.concat([df_col, df_null_col])

    return df.sort_index(axis=0)
In [ ]:
In [14]:
df.irow(127)
Out[14]:
In [15]:
#cols = [x for x in df.columns]
#for each in cols:
# g = sns.FacetGrid(df, col='poi', margin_titles=True, size=6)
# g.map(plt.hist, each, color='steelblue')
In [16]:
from pandas.tools.plotting import scatter_matrix
In [17]:
list(df.columns)
Out[17]:
In [18]:
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options',
'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income',
'long_term_incentive'])
email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi',
'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])
In [19]:
from sklearn.ensemble import RandomForestClassifier
In [20]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[financial_cols], df['poi'])
Out[20]:
In [21]:
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
In [22]:
padding = np.arange(len(financial_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, financial_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
In [23]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[email_cols], df['poi'])
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
padding = np.arange(len(email_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, email_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
In [24]:
all_cols = np.concatenate([email_cols, financial_cols])
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[all_cols], df['poi'])
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
padding = np.arange(len(all_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, all_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
In [25]:
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)
Out[25]:
In [26]:
df.exercised_stock_options.plot()
Out[26]:
In [27]:
def capValues(x, cap):
    """Return x limited to at most cap: values above cap are replaced by cap."""
    if x > cap:
        return cap
    return x
In [28]:
df.exercised_stock_options = df.exercised_stock_options.apply(lambda x: capValues(x, 5000000))
In [29]:
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)
Out[29]:
In [30]:
df[['ex_stock_bins', 'poi']].groupby('ex_stock_bins').mean().plot()
Out[30]:
In [31]:
df.columns
Out[31]:
In [32]:
df[['bonus', 'poi']].groupby('bonus').mean().plot()
Out[32]:
In [33]:
df.shared_receipt_with_poi.plot()
Out[33]:
In [34]:
max(df.shared_receipt_with_poi)
Out[34]:
In [35]:
# Create bins for shared receipt with poi
my_bins = [min(df.shared_receipt_with_poi)] + [250] + range(500, 5000, 500) + [max(df.shared_receipt_with_poi)]
df['shared_poi_bins'] = pd.cut(df.shared_receipt_with_poi, bins=my_bins, labels=False, include_lowest=True)
pd.value_counts(df['shared_poi_bins'])
Out[35]:
In [ ]:
In [36]:
df[['shared_poi_bins', 'poi']].groupby('shared_poi_bins').mean().plot()
Out[36]:
In [37]:
df.total_stock_value
Out[37]:
In [ ]:
In [38]:
from sklearn.preprocessing import StandardScaler
# Add standardized copies of total stock value and bonus as new columns.
df['total_stock_scaled'] = StandardScaler().fit_transform(df[['total_stock_value']])
df['bonus_scaled'] = StandardScaler().fit_transform(df[['bonus']])
# The summary and the histogram are two separate statements; they were fused on
# one line, which is a syntax error.
print(df.total_stock_scaled.describe())
plt.hist(df.total_stock_scaled)
In [39]:
def dont_neg_log(x):
    """Return log(1 + x) for x >= 0; any other input (negative, or NaN) maps to 0."""
    return np.log1p(x) if x >= 0 else 0
df['stock_log'] = df['total_stock_value'].apply(lambda x: dont_neg_log(x))
In [561]:
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options',
'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income',
'long_term_incentive'])
email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi',
'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])
In [562]:
payment_comp = ['salary', 'deferral_payments','bonus', 'expenses', 'loan_advances',
'other', 'director_fees', 'deferred_income', 'long_term_incentive']
payment_total = ['total_payments']
stock_comp = ['exercised_stock_options', 'restricted_stock','restricted_stock_deferred',]
stock_total = ['total_stock_value']
all_comp = payment_comp + stock_comp
email_comp = ['shared_receipt_with_poi', 'from_this_person_to_poi', 'from_poi_to_this_person' ]
email_totals = ['from_messages', 'to_messages'] # interaction_w_poi = total(from/to/shared poi)
In [ ]:
In [636]:
# Engineered features: totals, component/total ratios, POI-email ratios and squares.
# Several denominators can be 0, so the ratio columns may contain NaN/inf values;
# this cell leaves them in place.
df['total_compensation'] = df['total_payments'] + df['total_stock_value']

# Share of each component in its own total, then in total compensation.
# (component list, denominator column, label used in the new column name)
ratio_specs = [(payment_comp, 'total_payments', 'total_pay'),
               (stock_comp, 'total_stock_value', 'total_stock'),
               (all_comp, 'total_compensation', 'total_compensation')]
for components, total_col, label in ratio_specs:
    for component in components:
        df['{0}_{1}_ratio'.format(component, label)] = df[component] / df[total_col]

# POI-related email counts, and each count's share of their sum.
df['total_poi_interaction'] = (df['shared_receipt_with_poi']
                               + df['from_this_person_to_poi']
                               + df['from_poi_to_this_person'])
for component in email_comp:
    df['{0}_{1}_ratio'.format(component, 'total_poi_int')] = \
        df[component] / df['total_poi_interaction']

# Direct (sent/received) POI mail only, excluding shared receipts.
df['total_active_poi_interaction'] = (df['from_this_person_to_poi']
                                      + df['from_poi_to_this_person'])
df['to_poi_total_active_poi_ratio'] = \
    df['from_this_person_to_poi'] / df['total_active_poi_interaction']
df['from_poi_total_active_poi_ratio'] = \
    df['from_poi_to_this_person'] / df['total_active_poi_interaction']

# NOTE(review): judging by the column names, from_this_person_to_poi is divided by
# to_messages and from_poi_to_this_person by from_messages — confirm the two
# denominators are not swapped.
df['to_messages_to_poi_ratio'] = df['from_this_person_to_poi'] / df['to_messages']
df['from_messages_from_poi_ratio'] = df['from_poi_to_this_person'] / df['from_messages']
df['shared_poi_from_messages_ratio'] = df['shared_receipt_with_poi'] / df['from_messages']
df['shared_poi_total_compensation'] = df['shared_receipt_with_poi'] / df['total_compensation']
df['bonus_by_total_stock'] = df['bonus'] / df['total_stock_value']

## Add squared features (compensation components first, then email counts)
for component in all_comp + email_comp:
    df['{0}_squared'.format(component)] = df[component] ** 2
A good portion of people were paid either only in stock or payments. Another good portion also didn't have email statistics available.
For these people the ratios divide by 0, producing NaN (0/0) or inf (x/0) values, so they need to be set to zero manually.
In [640]:
df = df.apply(lambda x: x.fillna(0), axis=0)
In [644]:
In [565]:
df[['poi', 'director_fees_total_pay_ratio', 'director_fees', 'total_payments']]
Out[565]:
In [566]:
df[df['poi']==True]
Out[566]:
director_fees_total_pay_ratio, deferred_income_total_pay_ratio, exercised_stock_options_total_stock_ratio, exercised_stock_options_total_stock_ratio, restricted_stock_deferred_total_stock_ratio, restricted_stock_total_stock_ratio, director_fees_total_compensation_ratio, deferred_income_total_compensation_ratio, restricted_stock_total_compensation_ratio, restricted_stock_deferred_total_compensation_ratio
In [647]:
df = df.replace([np.inf, -np.inf], 0)
In [47]:
#df.ix[20:30, 30:40]
In [48]:
# Column/row slicing by number
# df.ix[11,:]
In [796]:
#all_cols2 = np.concatenate([all_cols, np.array(['shared_poi_bins', 'ex_stock_bins',
# 'total_stock_scaled', 'bonus_scaled',
# 'stock_log'])])
# from_messages_from_poi_to_ratio
features = np.array(df.drop('poi', axis=1).columns)
clf = ExtraTreesClassifier(n_estimators=3000)
clf.fit(df[features], df['poi'])
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
padding = np.arange(len(features)) + 0.5
plt.figure(figsize=(16,14))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, features[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
top10_features_RF = ['bonus', 'total_stock_value', 'other', 'total_compensation', 'expenses',
'other_total_pay_ratio', 'from_messages_from_poi_ratio', 'restricted_stock',
'shared_poi_from_messages_ratio', 'total_payments']
top10_features_ET = ['exercised_stock_options_squared', 'total_stock_value', 'bonus_total_pay_ratio',
'long_term_incentive_total_pay_ratio', 'bonus', 'deferred_income',
'total_compensation', 'to_messages_to_poi_ratio',
'from_messages_from_poi_ratio', 'to_messages_to_poi_ratio', 'other_total_pay_ratio',
'salary_squared', 'other']
In [ ]:
In [569]:
confusion_matrix(df['poi'], clf.predict(df[features]))
Out[569]:
In [206]:
#X_df = df.drop('poi', axis=1)
#y_df = df['poi']
#selector = SelectKBest(k=12, score_func=f_classif)
#selector = selector.fit_transform(X_df, y_df)
#selector
Out[206]:
In [ ]:
FINANCIAL_FIELDS = ['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options',
'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income',
'long_term_incentive', 'ex_stock_bins', 'stock_log']
EMAIL_FIELDS = ['from_messages', 'to_messages', 'shared_receipt_with_poi',
'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address',
'shared_poi_bins']
In [222]:
class ColumnExtractor(TransformerMixin):
    '''
    Column extractor transformer for sklearn pipelines.

    Selects a fixed list of columns from a pandas dataframe so that different
    column groups can be routed through different pipeline branches.

    Inherits fit_transform() from TransformerMixin, but this is explicitly
    defined here for clarity.

    Parameters
    ----------
    columns : list of column names, optional
        Columns to extract. None (the default) extracts no columns, matching the
        previous empty-list default without sharing one mutable list object
        between all instances.
    '''
    def __init__(self, columns=None):
        # Stored untouched: sklearn's clone() rebuilds the object from get_params()
        # and expects the constructor to keep parameters exactly as passed.
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        '''Nothing to learn; returns self so the transformer can be chained.'''
        return self

    def transform(self, X, **transform_params):
        '''
        Input: A pandas dataframe.
        Output: A pandas dataframe containing only the columns named in
        self.columns.
        '''
        columns = [] if self.columns is None else self.columns
        return X[columns]

    def fit_transform(self, X, y=None, **fit_params):
        '''Fit (a no-op) and return the extracted columns of X.'''
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep: boolean, optional
            Accepted for API compatibility; this transformer contains no
            sub-estimators, so it has no effect.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        # Must be a dict: clone() / GridSearchCV iterate over it and pass it back
        # to the constructor. Returning `self` (as before) breaks both.
        return {'columns': self.columns}

    def set_params(self, **params):
        """Set parameters of this estimator and return self.

        Raises
        ------
        ValueError
            If a name other than 'columns' is passed.
        """
        for name, value in params.items():
            if name != 'columns':
                raise ValueError('Invalid parameter {0} for ColumnExtractor'.format(name))
            self.columns = value
        return self
In [545]:
top10_features_ET
top10 = ['exercised_stock_options', 'total_stock_value', 'bonus', 'salary', 'deferred_income',
'long_term_incentive', 'restricted_stock', 'total_payments', 'loan_advances',
'shared_receipt_with_poi','total_compensation', 'from_messages_from_poi_ratio']
In [ ]:
In [936]:
#X_df = df[['total_payments', 'total_stock_value', 'shared_receipt_with_poi', 'bonus']].astype(float)
X_df = df.drop('poi', axis=1).astype(float)
#X_df = df[top10_features_ET]
#X_df = df[topKBest].astype(float)
y_df = df['poi']
y_df = y_df[X_df.abs().sum(axis=1) != 0]
X_df = X_df[X_df.abs().sum(axis=1) != 0]
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
sk_fold = StratifiedShuffleSplit(y_df, n_iter=100, test_size=0.1)
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy="median", verbose=0)),
#('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
('minmaxer', MinMaxScaler()),
#('low_var_remover', VarianceThreshold()),
('selection', SelectKBest(score_func=f_classif)),
('reducer', PCA()),
#('classifier', LinearSVC(penalty='l1', dual=False)),
#('KMeans', KMeans(n_clusters=2))
('classifier', LogisticRegression())
#('classifier2', SGDClassifier(n_iter=300))
]) # ,
#('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
# criterion='gini', n_estimators=1500, n_jobs=1,
# oob_score=True, random_state=None, verbose=0,
# max_features='auto', min_samples_split=2,
# min_samples_leaf=1))])
params = {
#'ET__n_estimators': [1500],
#'ET__max_features': ['auto', None, 3, 5, 10, 20],
#'ET__min_samples_split': [2, 4, 10],
#'ET__min_samples_leaf': [1, 2, 5],
'selection__k': [20, 17, 15],
'classifier__C': [1, 10, 100, 1000],
#'classifier2__alpha': [0.0001, 0.001],
#'classifier2__loss': ['hinge', 'log', 'modified_huber'],
#'classifier2__class_weight': [{True: 4, False: 1}, {True: 10, False: 1}],
#'classifier__penalty': ['l1', 'l2'],
'classifier__class_weight': [{True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}],
'classifier__tol': [1e-1, 1e-2, 1e-4, 1e-8, 1e-16, 1e-32],
'reducer__n_components': [1, 2, 3, 4, 5],
'reducer__whiten': [True, False]
#'feature_selection__k': [3, 5, 10, 20]
#'ET__criterion' : ['gini', 'entropy'],
#'imputer__strategy': ['median', 'mean'],
#'low_var_remover__threshold': [0, 0.1, .25, .50, .75, .90, .99]
}
# Scoring: average_precision, roc_auc, f1, recall, precision
grid_search = GridSearchCV(pipeline, param_grid=params, cv=sk_fold, n_jobs = 1, scoring='f1')
grid_search.fit(X_df, y=y_df)
#test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
#f1_avg.append(f1_score(y_test, test_pred))
#print "F1: ", f1_score(y_test, test_pred)
#print "Confusion Matrix: "
#print confusion_matrix(y_test, test_pred)
#print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_
In [937]:
# Re-estimate F1 / precision / recall of the best grid-search pipeline over many
# stratified random splits (10% of the rows held out per split).
n_iter = 1000
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
# Report progress roughly every 10%. Clamped to at least 1 so the modulo below
# cannot divide by zero when n_iter < 10.
progress_step = max(1, n_iter // 10)
f1_avg = []
recall_avg = []
precision_avg = []
for i, (train_index, test_index) in enumerate(sk_fold):
    # The splitter yields positional indices while X_df / y_df are indexed by
    # label, so select rows positionally with .iloc (replaces the deprecated
    # irow() and the implicit positional fallback of y_df[...]).
    X_train, X_test = X_df.iloc[train_index], X_df.iloc[test_index]
    y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
    # Refit the winning pipeline on this split's training rows only.
    # grid_search.predict() delegates to best_estimator_, i.e. the model just refit.
    grid_search.best_estimator_.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    if i % progress_step == 0:
        sys.stdout.write('{0}%..'.format(float(i)/n_iter*100))
        sys.stdout.flush()
    f1_avg.append(f1_score(y_test, test_pred))
    precision_avg.append(precision_score(y_test, test_pred))
    recall_avg.append(recall_score(y_test, test_pred))
print("Done!")
print("")
# Each metric is averaged over the per-split scores.
print("F1 Avg:  {0}".format(sum(f1_avg)/n_iter))
print("Precision Avg:  {0}".format(sum(precision_avg)/n_iter))
print("Recall Avg:  {0}".format(sum(recall_avg)/n_iter))
In [ ]:
F1 Avg: 0.309882173382 Precision Avg: 0.226065462315
Best Estimator: Pipeline(steps=[('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('low_var_remover', VarianceThreshold(threshold=0.1)), ('classifier', LinearSVC(C=0.1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-07, verbose=0))]) Best Params: {'classifierclass_weight': 'auto', 'low_var_removerthreshold': 0.1, 'classifierC': 0.1, 'classifiertol': 1e-07}
F1 Avg: 0.39108035853 Precision Avg: 0.263075613276
Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=True)), ('classifier', LogisticRegression(C=0.01, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.01))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': 'auto', 'classifierC': 0.01, 'reducern_components': 5, 'classifier__tol': 0.01}
F1 Avg: 0.408565806416 Precision Avg: 0.301739249639
Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=False)), ('classifier2', SGDClassifier(alpha=0.0001, class_weight='auto', epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=300, n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'reducern_components': 5, 'classifier2alpha': 0.0001, 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'reducerwhiten': False, 'classifier2penalty': 'elasticnet'}
F1 Avg: 0.293634931735 Precision Avg: 0.219107395382
Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LinearSVC(C=1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-08, verbose=0)), ('classifier2', SGDClassifier(a..., penalty='l2', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'classifier2alpha': 0.001, 'classifierclass_weight': 'auto', 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'classifiertol': 1e-08, 'classifier2penalty': 'l2', 'classifier__C': 1}
F1 Avg: 0.392249062049 Precision Avg: 0.300678174603
Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=4, whiten=True)), ('classifier', LogisticRegression(C=10, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))]) Best Params: {'reducern_components': 4, 'classifierclass_weight': 'auto', 'classifiertol': 0.0001, 'reducerwhiten': True, 'classifierC': 10, 'classifierpenalty': 'l2'}
F1 Avg: 0.461406277056 Precision Avg: 0.364574206349
Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=100, class_weight={False: 1, True: 8}, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.1))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': {False: 1, True: 8}, 'classifierC': 100, 'reducern_components': 1, 'classifier__tol': 0.1}
In [ ]:
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
#('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
#('low_var_remover', VarianceThreshold(threshold=0.1)),
#('feature_selection', LinearSVC()),
('features', FeatureUnion([
('financial', Pipeline([
('extract', ColumnExtractor(FINANCIAL_FIELDS)),
('scale', StandardScaler()),
('reduce', LinearSVC())
])),
('email', Pipeline([
('extract2', ColumnExtractor(EMAIL_FIELDS)),
('scale2', StandardScaler()),
('reduce2', LinearSVC())
]))
])),
('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
criterion='gini', n_estimators=1500, n_jobs=1,
oob_score=True, random_state=None, verbose=0,
max_features=None, min_samples_split=2,
min_samples_leaf=1))
])
In [938]:
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Score `clf` over repeated stratified shuffle splits and print the totals.

    The classifier is refit on every split; predictions on each held-out part are
    pooled into one confusion matrix, from which accuracy, precision, recall, F1
    and F2 are computed and printed.

    Parameters
    ----------
    clf : estimator with fit(X, y) and predict(X)
    dataset, feature_list : unused. Kept so the signature matches the project's
        tester script; the data is read from the notebook globals X_df (features)
        and y_df (labels) instead.
    folds : int
        Number of shuffle splits (seeded with random_state=42).

    Returns
    -------
    None. Results are printed. If a metric's denominator is zero (for example the
    classifier never predicts the positive class) a message is printed instead.
    """
    labels = y_df
    features = X_df
    cv = StratifiedShuffleSplit(labels, n_iter=folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        # Use THIS split's indices. The previous body read `train_index` /
        # `test_index`, globals left behind by another cell, so every iteration
        # re-scored one stale split. .iloc because the indices are positional
        # while the frames are indexed by label.
        features_train, features_test = features.iloc[train_idx], features.iloc[test_idx]
        labels_train, labels_test = labels.iloc[train_idx], labels.iloc[test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                # Remaining case; for binary 0/1 (False/True) values this is
                # prediction == 1 and truth == 1.
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print(clf)
        print("")
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        # Only the expected failure is caught; a bare except would also hide
        # unrelated errors such as a broken format string.
        print("Got a divide by zero when trying out: {0}".format(clf))
In [939]:
clf = Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))),
('reducer', PCA(copy=True, n_components=4, whiten=True)),
('classifier', LogisticRegression(C=10, class_weight='auto',
dual=False, fit_intercept=True,
intercept_scaling=1, penalty='l2',
random_state=None, tol=0.0001))])
In [940]:
#test_classifier(clf, None, None, folds=1000)
test_classifier(grid_search.best_estimator_, None, None, folds=1000)
In [784]:
#test_classifier(clf, None, None, folds=1000)
In [ ]:
In [ ]:
#!/usr/bin/python
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Load the dictionary containing the dataset
# The context manager closes the file handle, which the bare open() passed
# straight to pickle.load() never did. Mode "r" is kept from the original
# (Python 2); Python 3 would require "rb".
with open("final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB() # Provided to give you a starting point. Try a varity of classifiers.
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
In [ ]: